home *** CD-ROM | disk | FTP | other *** search
- $IN = $ARGV[0] eq '' ? "-" : $ARGV[0];
- open(IN) || die "can't open $IN";
-
- $slave= $ARGV[0] eq ''; #is current file slave or main-one
- #$skip=0; #state: skipping or no
- #$nothdr=0; #recording mode; starts with recording header
- $level=-1; #mssg indent
- $mbnumber=''; #msgboard number/name
-
- ###require "msgflt.pl";
- #uses $skip, $slave, $nothdr
- #$slave= 1; #is current file slave or main-one
- $skip=0; #state: skipping or no
- $nothdr=0; #recording mode; starts with recording header
-
- sub filtline { #return 1 if line should be skipped
- if (m|\">Back to main board</A>|i) {
- $skip=0 if $skip==1; #stop skipping
- $skip=1 if $skip==2; #if skipping inside-message mssg-thread
- $slave++;
- # s|http:.+mbs\.cgi.acct\=mb(\d{6})\"|mb$1.htm\"|;
- s|<A HREF=\".+mb(\d{6}).*\"|<A HREF=\"mb$1.htm\"|;
- s|<B>||; s|</B>||;
- }
-
- return 1 if m|<!--|; #skip comments
- return 1 if m|^<blockquote>|; #skip whole heading
- return 1 if m/[^<]<img src="/; #skip any line with pictures
- #stop header accumulating
- if (m/\"#POSTNEW\"/) { $nothdr=1; return 1; } #skip goto-post-new-message
- if (m/r>rebuilt<c/i) { $nothdr=1; return 1; } #rebuilt version
-
- #skip listbox-menu-header:
- return 1 if m|javascript:if\(confirm\(\'|; #works if teleportPro used #http://www.insidetheweb.com/
- return 1 if m|[^<]<option|; #skip listbox-menu-0
- return 1 if m|^<option|; #skip listbox-menu
- return 1 if m|[^<]<input type=\"submit\"|; #skip listbox-menu-footer
-
- if (m|</SCRIPT>|) { $skip = 0; return 1; } #end skipping javascript
- if (m|</body>|) { $skip = 0; } #end skipping everything NOW
-
- $skip=1 if m|<SCRIPT language=\"javascript\">|; #start skipping javascript
- # $skip=1 if m|<FORM ACTION=|; #start skipping newmsg form
- $skip=1 if m|<center><table border=3><tr><td>|; #start skipping newmsg form
- $skip=2 if m|[^>]>Message thread:<|; #start skipping thread-view
-
- return 1 if $skip;
- $skip=1 if m|>Back to main board</A>|; #start skiping again/new-mssg table
- $skip=1 if m|^ *</TITLE>|; #start skiping after eo title
- $skip=1 if m|[^<]</TITLE>|; #start skiping after eo title
- return 0;
- }
- ###1; #needed by "require()" operator
- ####eo require
-
- $z = ''; #input line
- ## this doesn't work if one mssg-link is >1 line! (coz <IN> divides by CRLF)!
- sub getTree {
- #to be able to process already rebuilt .htm
- $level=0 if ($z =~ m/^ *<hr size=/i);
- #to be able to process already rebuilt .htm
- while ($z =~ m|^ *<ul>|i) { $level++; $z =~ s|^ *<ul>||i; }
- while ($z =~ m|^ *</ul>|i) { $level--; $z =~ s|^ *</ul>||i; }
- # if ($z =~ m|[^<]<font size=\"-1\"|i) { #.' face='
- if ($z =~ m|[^<]<A HREF=|i || $lineprev ne '') {
- $z =~ s/<BR>$//i;
- $z =~ s/\n$//;
- #for the new sectioned format
- $z =~ s/^[ \t]*<td>\d+ //i; #strip root messages (if numbered)
- $z =~ s/^[ \t]*<td>//i; #strip root messages (notnumbered)
- $z =~ s|</td>||gi; #strip -/-
- if ($lineprev ne '') {
- $z = $lineprev.$z; $lineprev = '';
- } elsif ($z =~ m| *<A HREF=\"[^\"]+\">$|i) {
- $lineprev=$z;
- return;
- }
- #eo for the new sectioned format
-
- $z =~ s/^ *<hr size=.>//i; #strip
- $z =~ s"(<TABLE WIDTH=100%><TR><TD>|</TD></TR></TABLE>)""gi;
- $z =~ s/<font size=\".{1,2}\" face=\"arial,helvetica\">//gi;
- $z =~ s/<FONT SIZE=\".{1,2}\">//gi;
- $z =~ s"(<B>|</B>|</font>)""gi;
- (@q) = split( /<A HREF=/i, $z);
- $q[0] =~ s/^ +//; #strip trailing spaces
- # <href mail> name</a>: <href mssg> subj </a> (n/t) date : old/original
- # <href mssg> subj</a> (n/t) (<href mail> name</a>) date : new
- # newest: date is yyyy
- $old = !($q[0] eq '' && $q[1] =~ /^ *\"\d{8}\..+\">/);
- ## print $old;
- if ($old) {
- if ($#q >= 2) { #href mail available
- $msgsbj = $q[2]; ($mail,$name) = split( /\">/, $q[1]);
- } else { $mail=''; $name = $q[0]; $msgsbj = $q[1]; }
- ($mssg,$subj) = split( /\">/, $msgsbj);
- $name =~ s/: *$//; $name =~ s|</A>$||i; #strip :whitespace at end
- ($subj,$date) = split( m|</A>|i, $subj);
- } else {
- if ($#q >= 2) { #href mail available
- ($mail,$name) = split( /\">/, $q[2]);
- ($name,$date) = split( m|</A>|i, $name);
- $msgsbj = $q[1];
- $nt = ($msgsbj =~ s| +\(n/t\) *||i);
- } else {
- $mail='';
- ($msgsbj,$name) = split( m|</A>|i, $q[1]);
- $name =~ s/^ +//; #heading spaces
- $nt = ($name =~ s|\(n/t\) +||i);
- (@xx) = split( m| \(|, $name);
- $date = pop(@xx);
- $date = "($date" if ($date !~ /^\(/);
- $name = join( " (", @xx);
- $name =~ s/^ *\(//; $name =~ s/\)$//;
- }
- $date =~ s/^\) *//; #strip
- $date = "(n/t) $date" if $nt;
- ($mssg,$subj) = split( /\">/, $msgsbj);
- $subj =~ s|</A> *\(*$||i; #strip
- }
- $mssg =~ s/^\"//; #strip
- $mail =~ s/\"mailto://;
- $date =~ s/^ +//; #strip heading spaces
- $date =~ s/-(\d\d)(\d\d) /-$2 /; #plz 2 digit year
- if (0) {
- print "$z\n";
- for ($t=0; $t<=$#q; $t++) { print "$t:$q[$t].\n"; } print "=====\n";
- print "mail:$mail\nname:$name\nmssg:$mssg\nsubj:$subj\ndate:$date\n---\n";
- }
- $saver[$nmsg] = join( "\f", $level,$mail,$name,$mssg,$subj,$date ); # ,$z
- $saver[$nmsg] =~ s/ +\f/\f/g;
- $nmsg++;
- } else {
- $lineprev = ''; } #just in case
- }
-
-
-
- while (<IN>) { #if using (<>) $ARGV is current file's name
- #new sectioned fmt/changed things
- if (/^<p>\s*$/i && 5==$nothdr) { $skip=0; $nothdr=0; next; } #stop skiping/topmost listbox
- next if &filtline ;
-
- (@x) = split(/\"/);
- $z='';
- for ($i=0; $i<=$#x; $i++) {
- if ($x[$i] eq " tppabs=") { $i++; next; } #skip tppabs & it's value
-
- if ($x[$i] =~ /mbs.cgi.acct=mb/) {
- # (@y) = split(/\&/);
- (@y) = split(/\&/,$x[$i]);
- if ($mbnumber eq '') { #get main msgboard number
- $mbnumber=$y[0];
- $mbnumber =~ s/.+mbs.cgi.acct=//i;
- }
- loop1: #locate and get current message number
- for ($k=1; $k<$#y; $k++) {
- ($mynum,$nn) = split(/\=/, $y[$k]);
- if ($mynum eq 'MyNum') {
- $longnm = $x[$i]; #912345678 -> 12345678.9 #last 8 digits only
- $x[$i] = substr($nn,-8).".".substr($nn,-9,1);
- $longnames{ $x[$i] } = $longnm;
- last loop1;
- }
- }
- }
- $z .= '"' if $i;
- $z .= $x[$i];
- }
-
- # print $z;
-
- $z =~ s/<BR>[^>\n]/<BR>\n/; #??
- $hdr[$h++] = $z if !$nothdr;
- #new sectioned fmt/changed things
- #start skiping/topmost listbox 1st time only
- if ($z =~ m/^<body/i && !$nothdr) { $skip=1; $nothdr=5; next; }
-
- do getTree() if !$slave; #now tree extracting:
- }
- close IN;
-
- ################
-
- if (!$slave) {
- $mbnumber = "mbnumber" if $mbnumber eq '';
- print "messageboard: $mbnumber\n";
- $IN = "$mbnumber.tre";
- if (open(IN)) {
- print "old tree: $IN\n";
- $i=0; while (<IN>) {
- s/[\r\n]+$//g; s/ +\f/\f/g; ###$_ = substr($_,2);
- $xa{$_} = 1+$i; $ya[$i] = $_; $i++;
- } close IN;
- }
-
- $OUT = ">../$IN";
- open(OUT) || die "can't open $OUT";
- print "new tree: $OUT\n";
-
- $news=0;
- #relies on unique items/lines! if same line is repeated, result is unpredictable
- for ($ia=$ib=0; $ib<=$#saver && $ia<=$#ya; $ib++) {
- $n = $xa{ $saver[$ib] };
- if ($n) { #(!) it is 1 more
- for ( ;$ia<$n;$ia++) { print OUT "$ya[$ia]\n"; }
- ($level,$mail,$name,$mssg,$subj,$date) = split( "\f", $saver[$ib]);
- undef $longnames{$mssg}; #remove available messages
- } else {
- print OUT "$saver[$ib]\n";
- $news++;
- }
- }
- for ( ;$ib<=$#saver;$ib++) { print OUT "$saver[$ib]\n"; $news++; }
- for ( ;$ia<=$#ya;$ia++) { print OUT "$ya[$ia]\n";
- ($level,$mail,$name,$mssg,$subj,$date) = split( "\f", $ya[$ia]);
- undef $longnames{$mssg}; #remove available messages
- }
- close OUT;
-
- # $OUT = ">re_name.bat"; open(OUT) || die "can't open $OUT";
- $OU2 = ">getonly.htm";
- open(OU2) || die "can't open $OU2";
- print "page to retrieve new messages only: $OU2 ($news)\n";
- for ($h=0; $h<=$#hdr; $h++) { print OU2 $hdr[$h]; } #prologue
- $SITE = "http://www.insidetheweb.com/messageboard/";
- foreach $k (keys %longnames) { #only missing messages
- $http = $longnames{$k};
- $http =~ s|$SITE||i; #remove URL if any
- $kk = $k; $kk =~ s/\.9/\.HTM/;
- # print OUT "type \"$http\" | perl $0 >..\\$kk\n" if $http ne '';
- $http =~ s/mbs\.cgi./mbs\.cgi\?/i;
- $http =~ s/(MyNum=)(\d+).*$/$1$2&TL=$2&P=No/; #avoid thread above
- ### $http =~ $SITE.$http if $http !~ m|$SITE|;
- print OU2 "<A HREF=\"$SITE$http\"> $k </A><BR>\n" if $longnames{$k} ne '';
- }
- print OU2 "</body></html>\n"; #epilogue
- close OU2;
-
- $OUT = ">../$mbnumber.hdr"; #prologue
- open(OUT) || die "can't open $OUT";
- print "header: $OUT\n";
- for ($h=0; $h<=$#hdr; $h++) { print OUT $hdr[$h]; }
- close OUT;
- } else { #if used over single message... but better use msg2.pl
- for ($h=0; $h<=$#hdr; $h++) { print $hdr[$h]; }
- }
-
- #derived structure:
- # struct { thread-start MyNum=NNN/TL=NNN; //older msgb:TL is missing
- # -m1. MyNum=XXX/TL=NNN
- # -m2 MyNum=YYY/TL=NNN
- # ... which is where depends on <ul> and </ul> found
- # <ul> indents one level; </ul> outdents
- # } <HR size=0>
- #so possible actions:
- # if parser saves the tree, tree+separate messages are enough to rebuild;
- # then, get the new tree, merge, add new messages
- # SvD 01'99
-